library(arrow)
library(tidyverse)
library(Hmisc)
library(glue)

# Weighted standard deviation of ideology-bucket scores for a single domain.
#
# Arguments:
#   urls_df        Wide table with one row per URL: a `domain` column plus
#                  one engagement-count column per ideology bucket, named
#                  "-2", "-1", "0", "1", "2".
#   d              Domain whose rows are pooled.
#   engagement_var Unused; retained so existing callers keep working.
#
# Returns: a single number, sqrt of the frequency-weighted variance of the
# bucket scores (bucket / 2, i.e. -1 ... 1), weighted by the domain's total
# engagement in each bucket.
domain_sample_sd <- function (urls_df, d, engagement_var = "shares") {
    bucket_cols <- c("-2", "-1", "0", "1", "2")
    counts <- urls_df |>
        # .env$d guards against a column that happens to be called `d`
        dplyr::filter(.data$domain == .env$d) |>
        # Pick buckets by name: a positional `-2`:`2` range silently depends
        # on the column order produced upstream.
        dplyr::select(dplyr::any_of(bucket_cols))
    # A URL with no engagement in a bucket may be NA after pivoting; treat it
    # as zero instead of letting one NA wipe out the whole bucket total.
    w <- colSums(counts, na.rm = TRUE)
    # Derive each weight's score from its own column name so values and
    # weights cannot get out of step.
    bucket_scores <- as.numeric(names(w)) / 2
    sqrt(Hmisc::wtd.var(bucket_scores, w))
}

# Load the URL-level engagement table and normalise it for scoring.
df <- read_feather("data/final_df.feather") |>
    rename(domain = parent_domain) |>
    # rows with no ideology bucket cannot contribute to a score, so drop them
    drop_na(political_page_affinity) |>
    # flip the sign of any engagement counts that are still negative
    mutate(across(views:reacts, \(count) if_else(count < 0, abs(count), count)))


# Engagement-weighted ideology score per URL or per domain.
#
# Arguments:
#   df             Long table with columns `domain`, `clean_url`,
#                  `political_page_affinity` (bucket, divided by 2 to form
#                  the score) and the engagement column named by
#                  `engagement_var`.
#   engagement_var Name (string) of the engagement count column.
#   domain         TRUE to aggregate per domain, FALSE (default) per
#                  (domain, clean_url). Note this flag shares its name with
#                  the `domain` column; columns are always referenced by
#                  string below to keep the two apart.
#
# Returns: one row per group with
#   URL level:    domain, clean_url, "<var>_score", "num_<var>"
#   domain level: domain, "domain_<var>_score", "num_domain_<var>",
#                 "domain_<var>_sd" (via domain_sample_sd()).
# The score is sum(bucket / 2 * count) / sum(count); it is NaN when a group
# has zero total engagement.
compute_score <- function(df, engagement_var="shares", domain=FALSE) {
    stopifnot(is.character(engagement_var), length(engagement_var) == 1)
    by_domain <- isTRUE(domain)

    if (by_domain) {
        group_cols <- "domain"
        prefix <- "domain_"
    } else {
        group_cols <- c("domain", "clean_url")
        prefix <- ""
    }
    score_var <- paste0(prefix, engagement_var, "_score")
    sum_var <- paste0("num_", prefix, engagement_var)
    sd_var <- paste0(prefix, engagement_var, "_sd")

    agg <- df |>
        dplyr::group_by(dplyr::across(dplyr::all_of(group_cols))) |>
        dplyr::summarise(
            !!score_var :=
                sum((.data$political_page_affinity / 2) * .data[[engagement_var]]) /
                sum(.data[[engagement_var]]),
            !!sum_var := sum(.data[[engagement_var]]),
            .groups = "drop"
        )

    if (by_domain) {
        wide_df <- tidyr::pivot_wider(
            df,
            id_cols = dplyr::all_of(c("domain", "clean_url")),
            names_from = dplyr::all_of("political_page_affinity"),
            values_from = dplyr::all_of(engagement_var),
            # Bucket columns must come out as -2 ... 2 regardless of the row
            # order in df (the default is order of first appearance).
            names_sort = TRUE,
            # A URL with no row for a bucket had zero engagement there; the
            # default NA would poison the per-bucket totals downstream.
            values_fill = 0
        )
        agg <- dplyr::mutate(
            agg,
            !!sd_var := purrr::map_dbl(
                .data$domain,
                \(dom) domain_sample_sd(wide_df, dom)
            )
        )
    }
    agg
}

# domains <- df |> 
#     filter(!is.na(political_page_affinity)) |> 
#     select(parent_domain, political_page_affinity, shares) |> 
#     mutate(shares = if_else(shares < 0, abs(shares), shares)) |>
#     group_by(parent_domain, political_page_affinity) |>
#     dplyr::mutate(num = ((political_page_affinity/2)*shares)) |>
#     group_by(parent_domain) |>
#     dplyr::summarize(domain_score = sum(num)/sum(shares), n_shares = sum(shares)) |>
#     rename(domain = parent_domain) |>
#     ungroup()


# One score table per (aggregation level, engagement metric).
# The metric names are passed as literal strings on purpose: compute_score()
# hands `engagement_var` straight to tidy-select helpers.
score_list <- c(
    # URL-level tables
    list(
        url_share_scores = compute_score(df, engagement_var = "shares", domain = FALSE),
        url_view_scores  = compute_score(df, engagement_var = "views",  domain = FALSE),
        url_react_scores = compute_score(df, engagement_var = "reacts", domain = FALSE),
        url_click_scores = compute_score(df, engagement_var = "clicks", domain = FALSE)
    ),
    # domain-level tables
    list(
        domain_share_scores = compute_score(df, engagement_var = "shares", domain = TRUE),
        domain_view_scores  = compute_score(df, engagement_var = "views",  domain = TRUE),
        domain_react_scores = compute_score(df, engagement_var = "reacts", domain = TRUE),
        domain_click_scores = compute_score(df, engagement_var = "clicks", domain = TRUE)
    )
)

# Fold all score tables into one wide table (natural joins on the shared key
# columns), then add per-domain URL and share totals.
score_df <- score_list |>
    reduce(inner_join) |>
    mutate(
        num_domain_urls = n(),
        num_domain_shares = sum(num_shares),
        .by = domain
    )

# url_df <- df |> 
#     filter(!is.na(political_page_affinity)) |> 
#     select(parent_domain, political_page_affinity, clean_url, shares) |> 
#     mutate(shares = if_else(shares < 0, abs(shares), shares)) |>
#     group_by(parent_domain, clean_url, political_page_affinity) |>
#     dplyr::mutate(num = ((political_page_affinity/2)*shares)) |>
#     group_by(parent_domain, clean_url) |>
#     dplyr::summarize(url_score = sum(num)/sum(shares), n_shares = sum(shares)) |>
#     rename(domain = parent_domain) |>
#     ungroup()

# url_df <- df |> 
#     filter(!is.na(political_page_affinity)) |> 
#     select(domain, political_page_affinity, clean_url, shares) |> 
#     mutate(shares = if_else(shares < 0, abs(shares), shares)) |>
#     group_by(domain, clean_url, political_page_affinity) |>
#     dplyr::mutate(num = ((political_page_affinity/2)*shares)) |>
#     group_by(domain, clean_url) |>
#     dplyr::summarize(url_score = sum(num)/sum(shares), n_shares = sum(shares)) |>
#     ungroup()

# url_df <- df |>
#     filter(!is.na(political_page_affinity)) |>
#     select(domain = parent_domain, clean_url, shares, political_page_affinity) |>
#     pivot_wider(names_from=political_page_affinity, values_from = shares) |> 
#     inner_join(url_df, by = join_by(domain == domain, clean_url == clean_url))

# domains <- domains |> mutate(domain_sd = map_dbl(domain, \(x) domain_sample_sd(url_df, x)))
# head(domains)

# url_df <- left_join(url_df, 
#                     rename(domains, domain_shares = n_shares), 
#                     by = join_by(domain == domain))


# Attach analytic confidence bounds and significance labels to the URLs of
# one domain.
#
# For each URL with engagement count n, the bounds are
#   domain_score +/- 2.57 * domain_sd / sqrt(n)
# (2.57 is approximately the two-sided 99% normal quantile). Bounds falling
# outside [-1, 1] are stored as NA and treated as +/-1 when comparing. The
# ".tost" bounds widen the interval by the margin `m` on each side.
#
# Arguments:
#   u                Rows to process; must contain `domain` and the columns
#                    named by the *_var arguments below.
#   source           Domain to keep (rows of `u` with domain == source).
#   m                Margin added to the bounds for the ".tost" comparison.
#   url_count_var    Name of the per-URL engagement count column.
#   domain_count_var Currently unused; retained for interface compatibility.
#   url_score_var    Name of the per-URL score column.
#   domain_score_var Name of the domain score column (first value is used).
#   domain_sd_var    Name of the domain SD column (first value is used).
#   ci_var           Name of the list-column that receives the nested results.
#
# Returns: the matching rows without `domain`, plus a list-column `ci_var`
# whose tibbles hold ci.upper, ci.lower, ci.upper.tost, ci.lower.tost,
# stat_sig, sub_sig_left, sub_sig_right (0/1 integers) and sig_level (factor).
add_analytic_ci <- function(u, source, m = 0.1, 
                            url_count_var = "n_shares", domain_count_var = "domain_shares",
                            url_score_var = "url_score", domain_score_var = "domain_score", 
                            domain_sd_var = "domain_sd", ci_var = "ci") {
    required <- c("domain", url_count_var, url_score_var,
                  domain_score_var, domain_sd_var)
    absent <- setdiff(required, names(u))
    if (length(absent) > 0) {
        stop("add_analytic_ci(): missing column(s): ",
             paste(absent, collapse = ", "), call. = FALSE)
    }

    subset_df <- tibble::as_tibble(
        dplyr::filter(u, .data$domain == .env$source)
    )

    n <- subset_df[[url_count_var]]
    score <- subset_df[[url_score_var]]
    domain_score <- subset_df[[domain_score_var]][1]
    domain_sd <- subset_df[[domain_sd_var]][1]

    # The bounds depend only on n, so compute them per row directly.
    half_width <- 2.57 * domain_sd / sqrt(n)
    ci_upper <- domain_score + half_width
    ci_lower <- domain_score - half_width
    # Scores live in [-1, 1]; a bound outside that range is uninformative.
    ci_upper[which(ci_upper > 1)] <- NA
    ci_lower[which(ci_lower < -1)] <- NA

    ci_upper_tost <- ci_upper + m
    ci_lower_tost <- ci_lower - m
    ci_upper_tost[which(ci_upper_tost > 1)] <- NA
    ci_lower_tost[which(ci_lower_tost < -1)] <- NA

    # A missing bound falls back to the edge of the score range.
    stat_sig <- as.integer(
        score > dplyr::coalesce(ci_upper, 1) |
            score < dplyr::coalesce(ci_lower, -1)
    )
    sub_sig_left <- as.integer(score < dplyr::coalesce(ci_lower_tost, -1))
    sub_sig_right <- as.integer(score > dplyr::coalesce(ci_upper_tost, 1))

    # Precedence: right, then left, then the two non-substantive labels;
    # rows matching nothing (e.g. NA score) stay NA.
    sig_level <- dplyr::case_when(
        sub_sig_right == 1 ~ "Substantively Significantly Right",
        sub_sig_left == 1 ~ "Substantively Significantly Left",
        stat_sig == 1 & sub_sig_left == 0 & sub_sig_right == 0 ~
            "Statistically But Not Substantively Significant",
        stat_sig == 0 ~ "Not Statistically Significant"
    )

    ci_cols <- tibble::tibble(
        ci.upper = ci_upper,
        ci.lower = ci_lower,
        ci.upper.tost = ci_upper_tost,
        ci.lower.tost = ci_lower_tost,
        stat_sig = stat_sig,
        sub_sig_left = sub_sig_left,
        sub_sig_right = sub_sig_right,
        sig_level = as.factor(sig_level)
    )

    # `domain` is dropped so the result can be returned from group_modify(),
    # which re-attaches the grouping column itself.
    keep_cols <- setdiff(names(subset_df), "domain")
    out <- dplyr::bind_cols(subset_df[keep_cols], ci_cols)

    tidyr::nest(out, !!ci_var := dplyr::all_of(names(ci_cols)))
}

# url_df <- url_df |> 
#     group_by(domain) |> 
#     group_modify(~ add_analytic_ci(.x, .y$domain), .keep=TRUE) |> 
#     ungroup()


# head(score_df)

# Attach one nested CI column per engagement metric, domain by domain.
# Each group_modify() step returns a tibble still grouped by domain, so the
# steps can be chained; column names are passed as literal strings.
score_df <- score_df |>
    group_by(domain) |>
    group_modify(
        \(rows, key) add_analytic_ci(
            rows, key$domain,
            url_count_var = "num_shares",
            domain_count_var = "num_domain_shares",
            url_score_var = "shares_score",
            domain_score_var = "domain_shares_score",
            domain_sd_var = "domain_shares_sd",
            ci_var = "shares_ci"
        ),
        .keep = TRUE
    ) |>
    group_modify(
        \(rows, key) add_analytic_ci(
            rows, key$domain,
            url_count_var = "num_views",
            domain_count_var = "num_domain_views",
            url_score_var = "views_score",
            domain_score_var = "domain_views_score",
            domain_sd_var = "domain_views_sd",
            ci_var = "views_ci"
        ),
        .keep = TRUE
    ) |>
    group_modify(
        \(rows, key) add_analytic_ci(
            rows, key$domain,
            url_count_var = "num_clicks",
            domain_count_var = "num_domain_clicks",
            url_score_var = "clicks_score",
            domain_score_var = "domain_clicks_score",
            domain_sd_var = "domain_clicks_sd",
            ci_var = "clicks_ci"
        ),
        .keep = TRUE
    ) |>
    group_modify(
        \(rows, key) add_analytic_ci(
            rows, key$domain,
            url_count_var = "num_reacts",
            domain_count_var = "num_domain_reacts",
            url_score_var = "reacts_score",
            domain_score_var = "domain_reacts_score",
            domain_sd_var = "domain_reacts_sd",
            ci_var = "reacts_ci"
        ),
        .keep = TRUE
    ) |>
    ungroup()

# Domain-level table: every column whose name contains "domain", de-duplicated.
domains <- score_df |>
    select(contains("domain")) |>
    distinct()

# URL-level table is the full scored data frame.
url_df <- score_df

# Persist both tables for the plotting scripts.
save(url_df, domains, file = "data/plotting_data.RData")
